source("../references/biostats.R")
source("../references/iLOO.R")
`%!in%` = Negate(`%in%`)
# Add all required libraries that are installed with install.packages() here
list.of.packages <- c("RCurl", "tidyverse", "vegan", "pheatmap", "pastecs", "factoextra", "FactoMineR", "RColorBrewer", "tibble", "reshape2", "plotly", "cowplot", "clipr", "janitor", "ggpubr", "forcats", "apeglm", "car", "vsn", "devtools", "grid", "gridGraphics", "Rfast", "dendextend", "RColorBrewer", "scales", "VennDiagram", "colorspace", "edgeR", "devtools", "ggpattern", "eulerr", "Rmisc", "pairwiseAdonis")
# Add all libraries that are installed using BiocManager here
bioconductor.packages <- c("DESeq2", "WGCNA")
# Install BiocManager if needed
if(!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
# Get names of all required packages that aren't installed
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
new.bioc.packages <- bioconductor.packages[!(bioconductor.packages %in% installed.packages()[, "Package"])]
# Install all new packages
if(length(new.packages)) install.packages(new.packages)
# if(length(new.bioc.packages)) BiocManager::install(new.bioc.packages)
# Load all required libraries
all.packages <- c(list.of.packages, bioconductor.packages)
lapply(all.packages, FUN = function(X) {
do.call("require", list(X))
})
# # Github packages
# install_github("pmartinezarbizu/pairwiseAdonis/pairwiseAdonis")
#### Load counts and sample info (generated in the "0-Get-data.Rmd" notebook)
load(file="../data/sample.info")
load(file = "../results/counts")
load(file = "../results/counts.t")
load(file = "../results/counts.ts")
load(file = "../results/counts.annot.tanner")
load( file = "../references/tanner.blast")
load(file = "../references/tanner.blast.GO")
data.frame(rowSums(counts.ts)) %>%
dplyr::rename(read.total = 1) %>%
rownames_to_column(var="sample") %>% #summary()
dplyr::summarise(mean=round(mean(read.total, na.rm=T)),
sd=round(sd(read.total, na.rm=T)),
se=round(sd/sqrt(length(sample))),
median=median(read.total),
min=min(read.total),
max=max(read.total)) %>% t() %>% as.data.frame() %>%
dplyr::mutate(V1=prettyNum(V1, big.mark = ",")) %>%
dplyr::rename("fragments.per.sample"="V1") #use this to average across all samples
## fragments.per.sample
## mean 25,331,103
## sd 5,316,473
## se 675,193
## median 24,728,512
## min 14,169,939
## max 45,153,229
fragments <- data.frame(rowSums(counts.ts)) %>%
dplyr::rename(read.total = 1) %>%
rownames_to_column(var="sample") %>%
#mutate(sample=as.numeric(sample)) %>%
left_join(sample.info, by=c("sample"="sampleID")) %>%
mutate(treatment=factor(treatment, ordered = T, levels=c("ambient", "moderate_short", "moderate_long", "severe_short", "severe_long")))
ggplotly(
ggplot(data = fragments %>%
arrange(OA)) +
geom_bar(aes(x=sample, y=read.total, fill=treatment), stat = "identity") +
ggtitle("Total # fragments by sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank()) + theme_minimal())
hist(log(fragments$read.total))
shapiro.test(log(fragments$read.total))
##
## Shapiro-Wilk normality test
##
## data: log(fragments$read.total)
## W = 0.94306, p-value = 0.006271
summary(aov(log(read.total) ~ treatment, fragments))
## Df Sum Sq Mean Sq F value Pr(>F)
## treatment 4 0.6594 0.16484 5.726 0.000605 ***
## Residuals 57 1.6409 0.02879
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov(log(read.total) ~ treatment, fragments))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = log(read.total) ~ treatment, data = fragments)
##
## $treatment
## diff lwr upr p adj
## moderate_short-ambient 0.28938055 0.08986761 0.48889349 0.0012666
## moderate_long-ambient 0.17242633 -0.01891192 0.36376459 0.0963304
## severe_short-ambient 0.01377824 -0.18134926 0.20890575 0.9996400
## severe_long-ambient 0.11003537 -0.07799420 0.29806493 0.4734599
## moderate_long-moderate_short -0.11695422 -0.31276280 0.07885436 0.4526584
## severe_short-moderate_short -0.27560231 -0.47511525 -0.07608937 0.0023615
## severe_long-moderate_short -0.17934519 -0.37192189 0.01323152 0.0794315
## severe_short-moderate_long -0.15864809 -0.34998635 0.03269017 0.1487312
## severe_long-moderate_long -0.06239097 -0.24648525 0.12170331 0.8739603
## severe_long-severe_short 0.09625712 -0.09177244 0.28428669 0.6035428
anova(lm(log(read.total) ~ OA, fragments))
## Analysis of Variance Table
##
## Response: log(read.total)
## Df Sum Sq Mean Sq F value Pr(>F)
## OA 2 0.51799 0.258996 8.5737 0.0005387 ***
## Residuals 59 1.78228 0.030208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(lm(log(read.total) ~ duration, fragments))
## Analysis of Variance Table
##
## Response: log(read.total)
## Df Sum Sq Mean Sq F value Pr(>F)
## duration 1 0.00038 0.000377 0.0111 0.9165
## Residuals 48 1.62910 0.033940
anova(lm(log(read.total) ~ OA*duration, fragments))
## Analysis of Variance Table
##
## Response: log(read.total)
## Df Sum Sq Mean Sq F value Pr(>F)
## OA 1 0.32117 0.32117 12.6605 0.0008796 ***
## duration 1 0.00045 0.00045 0.0178 0.8944529
## OA:duration 1 0.14092 0.14092 5.5549 0.0227419 *
## Residuals 46 1.16694 0.02537
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(fragments, aes(x=OA, y=read.total)) + geom_boxplot() + theme_minimal()
ggplot(fragments, aes(x=duration, y=read.total)) + geom_boxplot() + theme_minimal()
ggplot(fragments, aes(x=treatment, y=read.total)) + geom_boxplot() + theme_minimal()
fragments %>% group_by(treatment) %>% tally()
## # A tibble: 5 × 2
## treatment n
## <ord> <int>
## 1 ambient 12
## 2 moderate_short 11
## 3 moderate_long 13
## 4 severe_short 12
## 5 severe_long 14
fragments %>% group_by(OA) %>% tally()
## # A tibble: 3 × 2
## OA n
## <fct> <int>
## 1 ambient 12
## 2 moderate 24
## 3 severe 26
fragments %>% group_by(duration) %>% tally()
## # A tibble: 3 × 2
## duration n
## <fct> <int>
## 1 long 27
## 2 short 23
## 3 <NA> 12
fragments %>% group_by(treatment) %>% tally() #%>% arrange(treatment, tank)
## # A tibble: 5 × 2
## treatment n
## <ord> <int>
## 1 ambient 12
## 2 moderate_short 11
## 3 moderate_long 13
## 4 severe_short 12
## 5 severe_long 14
prettyNum(ncol(counts.ts),big.mark = ",")
## [1] "11,491"
data.frame(rowSums(counts.ts != 0)) %>%
dplyr::rename(count.total = 1) %>%
rownames_to_column(var="sample") %>%
summarise(mean=round(mean(count.total, na.rm=T)),
sd=round(sd(count.total, na.rm=T)),
se=round(sd/sqrt(length(sample))),
median=median(count.total, na.rm=T),
min=min(count.total, na.rm=T),
max=max(count.total, na.rm=T)) %>% t() %>% as.data.frame() %>%
mutate(V1=prettyNum(V1, big.mark = ",")) %>%
dplyr::rename("genes.per.sample"="V1")
## genes.per.sample
## mean 11,140
## sd 103
## se 13
## median 11,173
## min 10,779
## max 11,267
#ggplotly(
ggplot(data = data.frame(rowSums(counts.t != 0, na.rm=TRUE)) %>%
dplyr::rename(count.total = 1) %>%
rownames_to_column(var="sample")) +
geom_bar(aes(x=sample, y=count.total), stat = "identity") + ggtitle("Total # genes by sample") +
theme(axis.text.x = element_blank(), axis.ticks.x = element_blank())#)
### Prep data for analyses
# IMPORTANT NOTE: to use data for ALL genes, use object "counts.t". To remove low-frequency genes, use object "counts.ts" (recommended option)
# merge count data with sample key, reset row names as sample names, and arrange by infection, then temperature, then day
counts.tk <- merge(x=sample.info[,c("sampleID", "OA", "duration", "treatment")], by.x="sampleID", y=counts.ts, by.y="row.names") %>%
arrange(treatment) %>% column_to_rownames(var="sampleID") %>% droplevels()
save(counts.tk, file = "../results/counts.tk")
head(counts.tk[1:24]) #check out results of merge/arrange
## OA duration treatment gene_22760 gene_22761 gene_22085 gene_22086
## S1 ambient <NA> ambient 114 7 98 144
## S10 ambient <NA> ambient 20 43 227 304
## S11 ambient <NA> ambient 259 27 107 174
## S12 ambient <NA> ambient 494 14 128 175
## S2 ambient <NA> ambient 239 4 169 233
## S3 ambient <NA> ambient 168 1 223 275
## gene_22090 gene_22092 gene_17987 gene_17988 gene_17989 gene_20323
## S1 98 96 232 194 62 83
## S10 131 165 411 463 146 195
## S11 123 170 308 266 78 112
## S12 115 98 180 238 66 57
## S2 130 158 327 339 116 104
## S3 127 162 441 383 122 324
## gene_15688 gene_24921 gene_46880 gene_46384 gene_41869 gene_3171 gene_3170
## S1 73 75 28 18 40 53 36
## S10 177 90 48 29 66 148 75
## S11 131 106 34 22 50 62 45
## S12 100 139 30 11 53 49 30
## S2 141 76 25 14 39 96 50
## S3 161 90 21 13 39 95 39
## gene_3172 gene_3157 gene_4558 gene_4559
## S1 30 43 131 328
## S10 105 43 166 536
## S11 41 86 166 529
## S12 24 44 335 402
## S2 78 34 105 344
## S3 74 57 93 401
# NOTE: It is absolutely critical that the **columns of the count matrix** and the **rows of the column data (information about samples)** are in the same order. DESeq2 will not make guesses as to which column of the count matrix belongs to which row of the column data, these must be provided to DESeq2 already in consistent order.
all(rownames(counts.tk) == counts.tk[-1:-3] %>% t() %>% colnames()) #check that rownames of untransformed matrix match column names of transformed matrix. Should print 'TRUE'
## [1] TRUE
##### If using counts which included multimapped counted fractionally, need to round to the nearest integer for DESeq2.
#_Not needed for gene counts produced by STAR aligner_
#counts.tk <- counts.tk %>% mutate_if(is.numeric, round)
dds.treatment <- DESeqDataSetFromMatrix(countData = counts.tk[-1:-3] %>% t(),
colData = counts.tk[,"treatment", drop=FALSE] ,
design = ~ treatment)
The variance stabilizing transformation is recommended prior to visualizing expression data and running PCA’s
vsd.treatment <- varianceStabilizingTransformation(dds.treatment, blind=FALSE)
save(vsd.treatment, file = "../results/vsd.treatment")
# Save variance-stabilization normalized gene counts x sample matrix, annotated for later use
counts.trans <- assay(vsd.treatment) %>% t() %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column(var = "sampleID") %>%
left_join(sample.info[,c("sampleID", "treatment")]) %>%
column_to_rownames(var = "sampleID") %>%
dplyr::select(treatment, everything()) %>%
mutate(treatment=factor(treatment, ordered = T,
levels=c("ambient","moderate_short","moderate_long","severe_short","severe_long")))
save(counts.trans, file = "../results/counts.trans")
# calculate CV for each gene across all samples
temp <- assay(vsd.treatment) %>% as.data.frame() %>%
rownames_to_column(var = "gene") %>%
rowwise() %>%
dplyr::mutate(
sd = sd(c_across(-gene)),
mean = mean(c_across(-gene)),
cv = sd/mean) %>%
#column_to_rownames("gene") %>%
arrange(dplyr::desc(cv))
# subset top 10% of most variable genes
most.variable <- temp %>%
head(n=round(0.1*nrow(assay(vsd.treatment)))) %>%
select(gene) %>% unlist() %>% as.vector()
# Generate heat map / cluster those top 10% most variable genes - any pattern in the clustering?
cluster.most.variable <- pheatmap(t(assay(vsd.treatment))[,most.variable],
Rowv=NA, Colv=NA, na.rm = TRUE, xlab = NA, scale="column",
show_colnames =FALSE, cluster_cols = FALSE, cluster_rows = TRUE,
annotation_colors = list(treatment=
c(`ambient`= "navyblue",
`moderate_short`="darkgreen", `moderate_long`="yellow3",
`severe_short`="sienna2", `severe_long`="firebrick4")),
annotation_row=as.data.frame(colData(vsd.treatment)[,c("treatment"), drop=FALSE]),
main = "gene counts - top 10% most variable\n(VSD-transformed)")
NOTE: this tree was generated by the pheatmap function in the previous code chunk
# Just plot hierarchical cluster, color-code sample by treatment, or by tank number
cluster.dendo <- as.dendrogram(cluster.most.variable$tree_row)
##bare bones dendogram
#plot(cluster.dendo)
# #A sub tree - looking at branches
# par(cex = 1)
# plot(cluster.dendo[[1]], horiz = TRUE)
order.dendrogram(cluster.dendo) # to find out the order of the sample labels
## [1] 26 55 52 2 6 41 61 10 36 51 7 22 32 33 45 53 57 58 38 59 60 11 20 34 1
## [26] 8 28 40 39 62 9 54 30 4 13 23 21 24 18 27 16 25 5 48 12 3 37 42 35 44
## [51] 50 56 15 46 47 14 31 29 43 49 17 19
# Create a dataframe ordered the same as the dendogram for annotation purposes
dendo.df <- sample.info[match(cluster.most.variable$tree_row$labels, sample.info$sampleID),c("sampleID", "treatment")]
all(dendo.df$sample == cluster.most.variable$tree_row$labels) # Should == TRUE
## [1] TRUE
# Create color coding vectors
colorCodes.treat <- c(`ambient`= "navyblue",
`moderate_short`="darkgreen", `moderate_long`="yellow3",
`severe_short`="sienna2", `severe_long`="firebrick4")
# Plot dendogram color-code by treatment
labels(cluster.dendo) <- dendo.df$sampleID[order.dendrogram(cluster.dendo)]
labels_colors(cluster.dendo) <- colorCodes.treat[as.character(dendo.df$treatment[order.dendrogram(cluster.dendo)])]
plot(cluster.dendo, xlab="Sample Number", cex.lab=0.9)
legend("topright", title = "Treatment", bty="n", legend =
c("ambient","moderate_short","moderate_long","severe_short","severe_long"),
fill = c("navyblue","darkgreen","yellow3","sienna2", "firebrick4"))
# To view the the guts of the PlotPCA function in DESeq2, execute this code chunk. It reveals that, by default, it only uses the top 500 most influential genes to plot the PCA!
#getMethod("plotPCA","DESeqTransform")
NOTE: Hover over points to see the sample numbers
# Generate PCA with points color coded by pH Treatment with various number of top genes to use for principal components, selected by highest row variance
# Top 500 most influential genes
ggplotly(
plotPCA(vsd.treatment, intgroup="treatment", ntop=500) +
ggtitle("PCA by Treatment (var-stabilizing transformed)\ntop 500 genes") +
geom_point(size=2, aes(text=colnames(vsd.treatment))) +
scale_color_manual(values=c("ambient"= "navyblue",
"moderate_short"="darkgreen", "moderate_long"="yellow3",
"severe_short"="sienna2", "severe_long"="firebrick4")) +
theme_minimal()+ stat_ellipse() , tooltip = "text")
# Top 2000 most influential genes
ggplotly(
plotPCA(vsd.treatment, intgroup="treatment", ntop=2000) +
ggtitle("PCA by Treatment (var-stabilizing transformed)\ntop 2000 genes") +
geom_point(size=2, aes(text=colnames(vsd.treatment))) +
scale_color_manual(values=c("ambient"= "navyblue",
"moderate_short"="darkgreen", "moderate_long"="yellow3",
"severe_short"="sienna2", "severe_long"="firebrick4")) +
theme_minimal()+ stat_ellipse() , tooltip = "text")
# All genes!
#ggplotly(
plotPCA(vsd.treatment, intgroup="treatment", ntop=nrow(assay(vsd.treatment))) +
ggtitle("PCA by Treatment (var-stabilizing transformed)\nall genes") +
geom_point(size=3, aes(text=colnames(vsd.treatment))) +
scale_color_manual(values=c("ambient"= "navyblue",
"moderate_short"="darkgreen", "moderate_long"="yellow3",
"severe_short"="sienna2", "severe_long"="firebrick4")) +
theme_minimal()+ stat_ellipse() #, tooltip = "text")
# PCA.data.treatment <- plotPCA(vsd.treatment, intgroup=c("treatment"), returnData=TRUE)
# save(PCA.data.treatment, file="../results/PCA.data.treatment")
This enables screeplot to ID significant PCs, calculates variance, etc.
#pca.princomp <- prcomp(cov(assay(vsd.pH)), scale=F) #scale=F for variance-covariance matrix
pca.princomp <- prcomp(t(assay(vsd.treatment))) # using the same code that's under the hood of PlotPCA but using all genes
pca.eigenval(pca.princomp) #The Proporation of Variance = %variance
## Importance of components:
## PC1 PC2 PC3 PC4
## Variance(eigenvalue) 1278.6133648 417.7152980 393.7549427 200.9273154
## Proportion of Variance 0.2682302 0.0876292 0.0826027 0.0421510
## Cumulative Proportion 0.2682302 0.3558594 0.4384621 0.4806131
## Broken-stick value 4.7123929 3.7123929 3.2123929 2.8790596
## PC5 PC6 PC7 PC8
## Variance(eigenvalue) 183.4703175 162.1425339 131.8860721 116.3797974
## Proportion of Variance 0.0384888 0.0340146 0.0276673 0.0244144
## Cumulative Proportion 0.5191018 0.5531164 0.5807838 0.6051982
## Broken-stick value 2.6290596 2.4290596 2.2623929 2.1195357
## PC9 PC10 PC11 PC12 PC13
## Variance(eigenvalue) 83.2244056 70.5070223 62.9660854 59.4698005 57.9788822
## Proportion of Variance 0.0174590 0.0147911 0.0132092 0.0124757 0.0121629
## Cumulative Proportion 0.6226572 0.6374483 0.6506574 0.6631331 0.6752961
## Broken-stick value 1.9945357 1.8834246 1.7834246 1.6925155 1.6091822
## PC14 PC15 PC16 PC17 PC18
## Variance(eigenvalue) 53.4711597 50.0573103 48.5971212 46.0268886 45.4766016
## Proportion of Variance 0.0112173 0.0105011 0.0101948 0.0096556 0.0095402
## Cumulative Proportion 0.6865133 0.6970145 0.7072093 0.7168649 0.7264051
## Broken-stick value 1.5322591 1.4608306 1.3941639 1.3316639 1.2728404
## PC19 PC20 PC21 PC22 PC23
## Variance(eigenvalue) 43.9067697 43.5326922 42.2451034 39.2892742 39.0733730
## Proportion of Variance 0.0092109 0.0091324 0.0088623 0.0082422 0.0081969
## Cumulative Proportion 0.7356159 0.7447483 0.7536106 0.7618528 0.7700497
## Broken-stick value 1.2172848 1.1646532 1.1146532 1.0670342 1.0215796
## PC24 PC25 PC26 PC27 PC28
## Variance(eigenvalue) 38.0791959 37.4023795 36.5726346 35.8129517 35.3161075
## Proportion of Variance 0.0079883 0.0078463 0.0076723 0.0075129 0.0074087
## Cumulative Proportion 0.7780380 0.7858843 0.7935566 0.8010695 0.8084782
## Broken-stick value 0.9781014 0.9364347 0.8964347 0.8579732 0.8209361
## PC29 PC30 PC31 PC32 PC33
## Variance(eigenvalue) 34.4481880 33.6746437 33.3057780 32.8500197 32.4635482
## Proportion of Variance 0.0072266 0.0070643 0.0069870 0.0068913 0.0068103
## Cumulative Proportion 0.8157048 0.8227692 0.8297561 0.8366475 0.8434577
## Broken-stick value 0.7852218 0.7507391 0.7174058 0.6851477 0.6538977
## PC34 PC35 PC36 PC37 PC38
## Variance(eigenvalue) 32.0611927 31.3890215 30.9897222 30.8702842 30.6381446
## Proportion of Variance 0.0067259 0.0065849 0.0065011 0.0064760 0.0064273
## Cumulative Proportion 0.8501836 0.8567685 0.8632695 0.8697456 0.8761729
## Broken-stick value 0.6235947 0.5941829 0.5656115 0.5378337 0.5108067
## PC39 PC40 PC41 PC42 PC43
## Variance(eigenvalue) 29.4863033 29.2467869 28.6886558 28.3800150 27.9185756
## Proportion of Variance 0.0061857 0.0061355 0.0060184 0.0059536 0.0058568
## Cumulative Proportion 0.8823586 0.8884941 0.8945124 0.9004661 0.9063229
## Broken-stick value 0.4844909 0.4588498 0.4338498 0.4094596 0.3856501
## PC44 PC45 PC46 PC47 PC48
## Variance(eigenvalue) 27.6088934 27.0919523 27.0708401 26.8234106 26.3636479
## Proportion of Variance 0.0057919 0.0056834 0.0056790 0.0056271 0.0055306
## Cumulative Proportion 0.9121147 0.9177981 0.9234771 0.9291042 0.9346348
## Broken-stick value 0.3623943 0.3396670 0.3174448 0.2957056 0.2744290
## PC49 PC50 PC51 PC52 PC53
## Variance(eigenvalue) 26.0594552 25.5637854 25.3217066 25.1792380 24.8850941
## Proportion of Variance 0.0054668 0.0053628 0.0053120 0.0052822 0.0052204
## Cumulative Proportion 0.9401016 0.9454644 0.9507765 0.9560586 0.9612791
## Broken-stick value 0.2535957 0.2331875 0.2131875 0.1935797 0.1743489
## PC54 PC55 PC56 PC57 PC58
## Variance(eigenvalue) 24.3344616 24.0766732 23.4110228 23.2293435 22.8689086
## Proportion of Variance 0.0051049 0.0050509 0.0049112 0.0048731 0.0047975
## Cumulative Proportion 0.9663840 0.9714349 0.9763461 0.9812192 0.9860167
## Broken-stick value 0.1554810 0.1369625 0.1187807 0.1009235 0.0833797
## PC59 PC60 PC61 PC62
## Variance(eigenvalue) 22.6844284 22.2705331 21.7015548 0.000000
## Proportion of Variance 0.0047588 0.0046720 0.0045526 0.000000
## Cumulative Proportion 0.9907754 0.9954474 1.0000000 1.000000
## Broken-stick value 0.0661383 0.0491891 0.0325225 0.016129
pc.percent <- pca.eigenval(pca.princomp)[2,1:6]*100
## Importance of components:
screeplot(pca.princomp, bstick=TRUE) #looks like PC 1-3 are significant
pc.percent[1:3] %>% sum() #total variance explained by first 3 (significant) axes
## [1] 43.84621
#### Generate dataframe with prcomp results
tab.expr <- data.frame(sample.id = colnames(assay(vsd.treatment)),
EV1.all = pca.princomp$x[,1], # the first eigenvector
EV2.all = pca.princomp$x[,2], # the second eigenvector
EV3.all = pca.princomp$x[,3], # the third eigenvector
stringsAsFactors = FALSE)
tab.expr.annot <- left_join(tab.expr, sample.info, by=c("sample.id"="sampleID")) %>% droplevels() #add treatment info to PC results
#pdf(file="../results/PCA-1x2.pdf", height = 5.25, width = 7.5)
ggscatter(tab.expr.annot,
x="EV1.all", y="EV2.all", col="treatment", shape="OA", size=3.5, alpha=0.85,
ellipse = FALSE, star.plot = FALSE) +
theme_minimal() + ggtitle("Global gene expression PC1xPC2") +
xlab(paste("PC1 (", round(pc.percent[1], digits = 1), "%)", sep="")) +
ylab(paste("PC2 (", round(pc.percent[2], digits = 1), "%)", sep="")) +
theme(legend.position = "right", legend.text=element_text(size=8), legend.title=element_text(size=9)) +
scale_color_manual(name="Treatment",
values=c("ambient"= "navyblue",
"moderate_short"="darkgreen",
"moderate_long"="yellow3",
"severe_short"="sienna2",
"severe_long"="firebrick4"),
labels=c("ambient"= "Control",
"moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure")) +
scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) +
guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))
#dev.off()
#pdf(file="../results/PCA-1x3.pdf", height = 5.25, width = 7.5)
ggscatter(tab.expr.annot,
x="EV1.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85,
ellipse = FALSE, star.plot = FALSE) +
theme_minimal() + ggtitle("Global gene expression PC1xPC3") +
xlab(paste("PC1 (", round(pc.percent[1], digits = 1), "%)", sep="")) +
ylab(paste("PC3 (", round(pc.percent[3], digits = 1), "%)", sep="")) +
theme(legend.position = "right", legend.text=element_text(size=8), legend.title=element_text(size=9)) +
scale_color_manual(name="Treatment",
values=c("ambient"= "navyblue",
"moderate_short"="darkgreen",
"moderate_long"="yellow3",
"severe_short"="sienna2",
"severe_long"="firebrick4"),
labels=c("ambient"= "Control",
"moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure")) +
scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) +
guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))
#dev.off()
#pdf(file="../results/PCA-2x3.pdf", height = 5.25, width = 7.5)
ggscatter(tab.expr.annot,
x="EV2.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85,
ellipse = FALSE, star.plot = FALSE) +
theme_minimal() + ggtitle("Global gene expression PC2xPC3") +
xlab(paste("PC2 (", round(pc.percent[2], digits = 1), "%)", sep="")) +
ylab(paste("PC3 (", round(pc.percent[3], digits = 1), "%)", sep="")) +
theme(legend.position = "right", legend.text=element_text(size=8), legend.title=element_text(size=9)) +
scale_color_manual(name="Treatment",
values=c("ambient"= "navyblue",
"moderate_short"="darkgreen",
"moderate_long"="yellow3",
"severe_short"="sienna2",
"severe_long"="firebrick4"),
labels=c("ambient"= "Control",
"moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure")) +
scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) +
guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))
#dev.off()
pca.princomp.var <- prcomp(t(assay(vsd.treatment)[most.variable,])) # using the same code that's under the hood of PlotPCA
pca.eigenval(pca.princomp.var) #The Proporation of Variance = %variance
## Importance of components:
## PC1 PC2 PC3 PC4
## Variance(eigenvalue) 707.2304441 332.2734110 225.2619886 118.4688183
## Proportion of Variance 0.3150528 0.1480192 0.1003484 0.0527748
## Cumulative Proportion 0.3150528 0.4630720 0.5634204 0.6161952
## Broken-stick value 4.7123929 3.7123929 3.2123929 2.8790596
## PC5 PC6 PC7 PC8 PC9
## Variance(eigenvalue) 82.0108313 58.9954983 50.0824286 33.4080096 28.3348510
## Proportion of Variance 0.0365337 0.0262810 0.0223104 0.0148824 0.0126224
## Cumulative Proportion 0.6527289 0.6790099 0.7013203 0.7162027 0.7288252
## Broken-stick value 2.6290596 2.4290596 2.2623929 2.1195357 1.9945357
## PC10 PC11 PC12 PC13 PC14
## Variance(eigenvalue) 26.2804821 23.4192726 23.2255705 22.1982289 20.1591584
## Proportion of Variance 0.0117073 0.0104327 0.0103464 0.0098887 0.0089804
## Cumulative Proportion 0.7405324 0.7509651 0.7613115 0.7712002 0.7801806
## Broken-stick value 1.8834246 1.7834246 1.6925155 1.6091822 1.5322591
## PC15 PC16 PC17 PC18 PC19
## Variance(eigenvalue) 18.8994541 18.7351274 17.6345843 17.3859909 16.4888519
## Proportion of Variance 0.0084192 0.0083460 0.0078558 0.0077450 0.0073454
## Cumulative Proportion 0.7885998 0.7969459 0.8048016 0.8125466 0.8198920
## Broken-stick value 1.4608306 1.3941639 1.3316639 1.2728404 1.2172848
## PC20 PC21 PC22 PC23 PC24
## Variance(eigenvalue) 15.7071416 15.4145265 15.0747853 14.2694287 13.9591999
## Proportion of Variance 0.0069971 0.0068668 0.0067154 0.0063567 0.0062185
## Cumulative Proportion 0.8268891 0.8337559 0.8404713 0.8468280 0.8530464
## Broken-stick value 1.1646532 1.1146532 1.0670342 1.0215796 0.9781014
## PC25 PC26 PC27 PC28 PC29
## Variance(eigenvalue) 13.4359831 13.1699022 12.9891172 12.5184569 12.1486413
## Proportion of Variance 0.0059854 0.0058669 0.0057863 0.0055766 0.0054119
## Cumulative Proportion 0.8590318 0.8648986 0.8706850 0.8762616 0.8816735
## Broken-stick value 0.9364347 0.8964347 0.8579732 0.8209361 0.7852218
## PC30 PC31 PC32 PC33 PC34
## Variance(eigenvalue) 11.8194865 11.2711133 11.1031844 10.9081864 10.7906023
## Proportion of Variance 0.0052653 0.0050210 0.0049462 0.0048593 0.0048069
## Cumulative Proportion 0.8869388 0.8919598 0.8969060 0.9017653 0.9065722
## Broken-stick value 0.7507391 0.7174058 0.6851477 0.6538977 0.6235947
## PC35 PC36 PC37 PC38 PC39
## Variance(eigenvalue) 10.4054008 10.2423509 9.8773929 9.6527840 9.5140338
## Proportion of Variance 0.0046353 0.0045627 0.0044001 0.0043001 0.0042383
## Cumulative Proportion 0.9112075 0.9157702 0.9201704 0.9244704 0.9287087
## Broken-stick value 0.5941829 0.5656115 0.5378337 0.5108067 0.4844909
## PC40 PC41 PC42 PC43 PC44
## Variance(eigenvalue) 9.2568041 9.0674484 8.8981152 8.4811703 8.4329872
## Proportion of Variance 0.0041237 0.0040393 0.0039639 0.0037781 0.0037567
## Cumulative Proportion 0.9328324 0.9368717 0.9408356 0.9446137 0.9483704
## Broken-stick value 0.4588498 0.4338498 0.4094596 0.3856501 0.3623943
## PC45 PC46 PC47 PC48 PC49
## Variance(eigenvalue) 8.2720775 7.9545000 7.8875604 7.8274094 7.6718839
## Proportion of Variance 0.0036850 0.0035435 0.0035137 0.0034869 0.0034176
## Cumulative Proportion 0.9520554 0.9555989 0.9591126 0.9625995 0.9660171
## Broken-stick value 0.3396670 0.3174448 0.2957056 0.2744290 0.2535957
## PC50 PC51 PC52 PC53 PC54
## Variance(eigenvalue) 7.2871059 7.1603395 6.9113492 6.8289192 6.6452737
## Proportion of Variance 0.0032462 0.0031897 0.0030788 0.0030421 0.0029603
## Cumulative Proportion 0.9692633 0.9724531 0.9755319 0.9785740 0.9815343
## Broken-stick value 0.2331875 0.2131875 0.1935797 0.1743489 0.1554810
## PC55 PC56 PC57 PC58 PC59
## Variance(eigenvalue) 6.5279829 6.3965981 6.1602060 6.0503503 5.7049364
## Proportion of Variance 0.0029080 0.0028495 0.0027442 0.0026953 0.0025414
## Cumulative Proportion 0.9844424 0.9872919 0.9900361 0.9927314 0.9952728
## Broken-stick value 0.1369625 0.1187807 0.1009235 0.0833797 0.0661383
## PC60 PC61 PC62
## Variance(eigenvalue) 5.4602775 5.1513878 0.000000
## Proportion of Variance 0.0024324 0.0022948 0.000000
## Cumulative Proportion 0.9977052 1.0000000 1.000000
## Broken-stick value 0.0491891 0.0325225 0.016129
pc.percent.var <- pca.eigenval(pca.princomp.var)[2,1:5]*100
## Importance of components:
pc.percent.var[1:3] %>% sum()
## [1] 56.34204
screeplot(pca.princomp.var, bstick=TRUE) #looks like PC 1-3 are significant
tab.expr.var <- data.frame(sample.id = colnames(assay(vsd.treatment[most.variable,])),
EV1.all = pca.princomp.var$x[,1], # the first eigenvector
EV2.all = pca.princomp.var$x[,2], # the second eigenvector
EV3.all = pca.princomp.var$x[,3], # the third eigenvector
stringsAsFactors = FALSE)
#shapiro.test(pca.princomp$x) #sample size too large for shapiro test which is weird
#hist(pca.princomp$x) #normal? hard to say, maybe
tab.expr.annot.var <- left_join(tab.expr.var, sample.info, by=c("sample.id"="sampleID")) %>% droplevels()
# GGSCATTER with ellipses and stars - PC1 PC2
ggscatter(tab.expr.annot.var,
x="EV1.all", y="EV2.all", col="treatment", shape="OA", size=3.5, alpha=0.85,
ellipse = FALSE, star.plot = FALSE) +
theme_minimal() + ggtitle("Gene Expression PC1xPC2\n10% most variable genes") +
xlab(paste("PC1 (", round(pc.percent.var[1], digits = 1), "%)", sep="")) +
ylab(paste("PC2 (", round(pc.percent.var[2], digits = 1), "%)", sep="")) +
labs(color="OA Treatment") +
theme(legend.position = "right") +
scale_color_manual(name="Treatment",
values=c("ambient"= "navyblue",
"moderate_short"="darkgreen",
"moderate_long"="yellow3",
"severe_short"="sienna2",
"severe_long"="firebrick4"),
labels=c("ambient"= "Control",
"moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure")) +
scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) +
guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))
# GGSCATTER with ellipses and stars - PC1 PC3
ggscatter(tab.expr.annot.var,
x="EV1.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85,
ellipse = FALSE, star.plot = FALSE) +
theme_minimal() + ggtitle("Gene Expression PC1xPC3\n10% most variable genes") +
xlab(paste("PC1 (", round(pc.percent.var[1], digits = 1), "%)", sep="")) +
ylab(paste("PC3 (", round(pc.percent.var[3], digits = 1), "%)", sep="")) +
labs(color="OA Treatment") +
theme(legend.position = "right") +
scale_color_manual(name="Treatment",
values=c("ambient"= "navyblue",
"moderate_short"="darkgreen",
"moderate_long"="yellow3",
"severe_short"="sienna2",
"severe_long"="firebrick4"),
labels=c("ambient"= "Control",
"moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure")) +
scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) +
guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))
# GGSCATTER with ellipses and stars - PC2 PC3
ggscatter(tab.expr.annot.var,
x="EV2.all", y="EV3.all", col="treatment", shape="OA", size=3.5, alpha=0.85,
ellipse = FALSE, star.plot = FALSE) +
theme_minimal() + ggtitle("Gene Expression PC2xPC3\n10% most variable genes") +
xlab(paste("PC2 (", round(pc.percent.var[2], digits = 1), "%)", sep="")) +
ylab(paste("PC3 (", round(pc.percent.var[3], digits = 1), "%)", sep="")) +
labs(color="OA Treatment") +
theme(legend.position = "right") +
scale_color_manual(name="Treatment",
values=c("ambient"= "navyblue",
"moderate_short"="darkgreen",
"moderate_long"="yellow3",
"severe_short"="sienna2",
"severe_long"="firebrick4"),
labels=c("ambient"= "Control",
"moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure")) +
scale_shape_manual(guide="none", values=c("ambient"=19, "moderate"=17, "severe"=15)) +
guides(colour = guide_legend(override.aes = list(size=3.5, linetype="blank", shape=c(19, 17, 17, 15, 15))))
# Effect of treatment
vsd.t <- t(assay(vsd.treatment)) %>% as.data.frame()
perm <- adonis(vsd.t ~ treatment, data=sample.info %>% remove_rownames() %>% column_to_rownames("sampleID"), permutations = 1000, method="bray")
# Pairwise comparisons
vsd.t <- sample.info %>% dplyr::select(sampleID, OA, duration, treatment) %>% left_join(t(assay(vsd.treatment)) %>% as.data.frame() %>% rownames_to_column("sampleID"))
# Clustering by OA not considering exposure time?
pairwise.adonis(vsd.t[,-1:-4], vsd.t$OA, perm = 1000)
## pairs Df SumsOfSqs F.Model R2 p.value p.adjusted
## 1 ambient vs severe 1 0.001556803 1.843721 0.04871933 0.049950050 0.14985015
## 2 ambient vs moderate 1 0.002722418 3.312499 0.08877720 0.003996004 0.01198801
## 3 severe vs moderate 1 0.001275948 1.580725 0.03188185 0.066933067 0.20079920
## sig
## 1
## 2 .
## 3
# Ambient (i.e. control) vs. moderate appear to differ in multivariate space (not considering exposure time)
# Clustering by OA considering exposure time?
pairwise.adonis(vsd.t[,-1:-4], vsd.t$treatment, perm = 1000)
## pairs Df SumsOfSqs F.Model R2
## 1 ambient vs severe_long 1 0.001807829 2.180610 0.08329104
## 2 ambient vs severe_short 1 0.001143298 1.322092 0.05668840
## 3 ambient vs moderate_long 1 0.003029834 4.027043 0.14900052
## 4 ambient vs moderate_short 1 0.001834446 2.051760 0.08900664
## 5 severe_long vs severe_short 1 0.001273684 1.581716 0.06182994
## 6 severe_long vs moderate_long 1 0.001115312 1.583724 0.05957493
## 7 severe_long vs moderate_short 1 0.001173246 1.414508 0.05793720
## 8 severe_short vs moderate_long 1 0.002112709 2.903888 0.11210241
## 9 severe_short vs moderate_short 1 0.001099456 1.268272 0.05695422
## 10 moderate_long vs moderate_short 1 0.001660921 2.216647 0.09153402
## p.value p.adjusted sig
## 1 0.043956044 0.43956044
## 2 0.181818182 1.00000000
## 3 0.000999001 0.00999001 *
## 4 0.043956044 0.43956044
## 5 0.087912088 0.87912088
## 6 0.091908092 0.91908092
## 7 0.108891109 1.00000000
## 8 0.002997003 0.02997003 .
## 9 0.183816184 1.00000000
## 10 0.007992008 0.07992008
# Yes, ambient vs moderate_long differ, and severe_short vs moderate_long
This is the core DESeq2 analysis!
# these are the treatment "levels" that will be used to run pairwise contrasts
levels(dds.DESeq.treatment$treatment)
## [1] "ambient" "moderate_long" "moderate_short" "severe_long"
## [5] "severe_short"
print("Comparison: Ambient pH vs. moderate-short")
## [1] "Comparison: Ambient pH vs. moderate-short"
summary(res.mod.short <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "moderate_short"), alpha=0.05))
##
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 533, 4.6%
## LFC < 0 (down) : 529, 4.6%
## outliers [1] : 0, 0%
## low counts [2] : 1337, 12%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:", sum(res.mod.short$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 1062"
print("Comparison: Ambient pH vs. moderate-long")
## [1] "Comparison: Ambient pH vs. moderate-long"
summary(res.mod.long <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "moderate_long"), alpha=0.05))
##
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1072, 9.3%
## LFC < 0 (down) : 917, 8%
## outliers [1] : 0, 0%
## low counts [2] : 1337, 12%
## (mean count < 7)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:", sum(res.mod.long$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 1989"
print("Comparison: Ambient pH vs. severe-short")
## [1] "Comparison: Ambient pH vs. severe-short"
summary(res.sev.short <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "severe_short"), alpha=0.05))
##
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 20, 0.17%
## LFC < 0 (down) : 27, 0.23%
## outliers [1] : 0, 0%
## low counts [2] : 7575, 66%
## (mean count < 319)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:", sum(res.sev.short$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 47"
print("Comparison: Ambient pH vs. severe-long")
## [1] "Comparison: Ambient pH vs. severe-long"
summary(res.sev.long <- results(dds.DESeq.treatment, contrast=c("treatment", "ambient", "severe_long"), alpha=0.05))
##
## out of 11491 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 287, 2.5%
## LFC < 0 (down) : 239, 2.1%
## outliers [1] : 0, 0%
## low counts [2] : 2451, 21%
## (mean count < 16)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
paste("No. of genes differentially expressed (padj<0.05) by pH, 3C:", sum(res.sev.long$padj < 0.05, na.rm=TRUE))
## [1] "No. of genes differentially expressed (padj<0.05) by pH, 3C: 526"
nrow(res.sev.long)
## [1] 11491
#### Save differential expression results to objects
save(res.mod.short, file = "../results/deseq2/res.mod.short")
save(res.mod.long, file = "../results/deseq2/res.mod.long")
save(res.sev.short, file = "../results/deseq2/res.sev.short")
save(res.sev.long, file = "../results/deseq2/res.sev.long")
#### Subset the DESeq results for only those statistically sign. ones
# Could also filter to only include those with abs(L2FC)>0.5, but that might filter out interesting genes -
diffex.mod.short <- subset(res.mod.short, padj < 0.05) # & abs(log2FoldChange)>0.5
diffex.mod.long <- subset(res.mod.long, padj < 0.05)
diffex.sev.short <- subset(res.sev.short, padj < 0.05)
diffex.sev.long <- subset(res.sev.long, padj < 0.05)
Inspect Cook’s distances for all genes.
par(mar=c(8,5,2,2))
boxplot(log10(assays(dds.DESeq.treatment)[["cooks"]]), range=0, las=2)
DESeq2 has integrated outlier detection. But, after inspecting DEG’s, there do appear to be a few influenced by outlier samples. So, I also perform outlier detection using the iterative Leave-One-Out method (iLOO) and script from George et al. 2015, see iLOO.R script
Note: The function iLOO requires an input matrix, which represents a matrix of read counts for a treatment or homogeneous group containing samples and features. iLOO returns a matrix, where only the outlier read counts are present (all non-outlier values have been NA’ed).
#sample.info$treatment %>% levels()
# Create vectors of sample IDs per treatment
samples.ambient <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="ambient"))$sample
samples.moderate_long <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="moderate_long"))$sample
samples.moderate_short <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="moderate_short"))$sample
samples.severe_long <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="severe_long"))$sample
samples.severe_short <- (sample.info[c("sample", "treatment")] %>% filter(treatment=="severe_short"))$sample
# Create vector of all DEGs identified among any contrast
degs <- unique(c(rownames(diffex.mod.short),rownames(diffex.mod.long),rownames(diffex.sev.short), rownames(diffex.sev.long)))
length(degs)
## [1] 2724
# Extract gene counts that have outliers replaced with trimmed means, and filter for DEGs
degs.counts <- assays(dds.DESeq.treatment)[["replaceCounts"]] %>% as.data.frame() %>% rownames_to_column("gene") %>% filter(gene %in% degs)
# Extract counts for
counts.ambient <- degs.counts %>% select(gene, all_of(samples.ambient)) %>% column_to_rownames("gene")
counts.moderate_long <- degs.counts %>% select(gene, all_of(samples.moderate_long)) %>% column_to_rownames("gene")
counts.moderate_short <- degs.counts %>% select(gene, all_of(samples.moderate_short)) %>% column_to_rownames("gene")
counts.severe_long <- degs.counts %>% select(gene, all_of(samples.severe_long)) %>% column_to_rownames("gene")
counts.severe_short <- degs.counts %>% select(gene, all_of(samples.severe_short)) %>% column_to_rownames("gene")
# Run the iterative Leave-One-Out function on gene counts for each treatment
iLOO.ambient <- iLOO(counts.ambient)
iLOO.moderate_long <- iLOO(counts.moderate_long)
iLOO.moderate_short <- iLOO(counts.moderate_short)
iLOO.severe_long <- iLOO(counts.severe_long)
iLOO.severe_short <- iLOO(counts.severe_short)
# find samples with lots of outliers in DEG lists
iLOO.ambient %>% as.data.frame() %>% rownames_to_column("gene") %>%
pivot_longer(-gene) %>% filter(!is.na(value)) %>%
mutate(name=as.factor(name)) %>%
filter(gene %in% c(rownames(diffex.mod.short), rownames(diffex.mod.long),
rownames(diffex.sev.short), rownames(diffex.sev.long))) %>%
group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 11 × 2
## name n
## <fct> <int>
## 1 S12 14
## 2 S4 12
## 3 S11 6
## 4 S6 6
## 5 S3 3
## 6 S5 2
## 7 S7 2
## 8 S1 1
## 9 S10 1
## 10 S2 1
## 11 S8 1
iLOO.moderate_long %>% as.data.frame() %>% rownames_to_column("gene") %>%
pivot_longer(-gene) %>% filter(!is.na(value)) %>%
mutate(name=as.factor(name)) %>%
filter(gene %in% c(rownames(diffex.mod.long))) %>%
group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 9 × 2
## name n
## <fct> <int>
## 1 S26 40
## 2 S27 40
## 3 S17 13
## 4 S24 12
## 5 S21 3
## 6 S22 3
## 7 S14 1
## 8 S15 1
## 9 S19 1
iLOO.moderate_short %>% as.data.frame() %>% rownames_to_column("gene") %>%
pivot_longer(-gene) %>% filter(!is.na(value)) %>%
mutate(name=as.factor(name)) %>%
filter(gene %in% c(rownames(diffex.mod.short))) %>%
group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 8 × 2
## name n
## <fct> <int>
## 1 S32 43
## 2 S38 9
## 3 S31 4
## 4 S28 2
## 5 S30 2
## 6 S36 2
## 7 S35 1
## 8 S37 1
iLOO.severe_short %>% as.data.frame() %>% rownames_to_column("gene") %>%
pivot_longer(-gene) %>% filter(!is.na(value)) %>%
mutate(name=as.factor(name)) %>%
filter(gene %in% c(rownames(diffex.sev.short))) %>%
group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 0 × 2
## # … with 2 variables: name <fct>, n <int>
iLOO.severe_long %>% as.data.frame() %>% rownames_to_column("gene") %>%
pivot_longer(-gene) %>% filter(!is.na(value)) %>%
mutate(name=as.factor(name)) %>%
filter(gene %in% c(rownames(diffex.sev.long))) %>%
group_by(name) %>% tally() %>% arrange(dplyr::desc(n))
## # A tibble: 4 × 2
## name n
## <fct> <int>
## 1 S49 8
## 2 S44 6
## 3 S9 5
## 4 S43 4
# Ambient vs. moderate-short
outs.diffex.mod.short <- full_join(
iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.mod.short)),
iLOO.moderate_short[rowSums(is.na(iLOO.moderate_short)) != ncol(iLOO.moderate_short), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.mod.short)),
by="gene")
# Ambient vs. moderate-long
outs.diffex.mod.long <- full_join(
iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.mod.long)),
iLOO.moderate_long[rowSums(is.na(iLOO.moderate_long)) != ncol(iLOO.moderate_long), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.mod.long)),
by="gene")
# Ambient vs. severe-short
outs.diffex.sev.short <- full_join(
iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.sev.short)),
iLOO.severe_short[rowSums(is.na(iLOO.severe_short)) != ncol(iLOO.severe_short), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.sev.short)),
by="gene")
# Ambient vs. secere-short
outs.diffex.sev.long <- full_join(
iLOO.ambient[rowSums(is.na(iLOO.ambient)) != ncol(iLOO.ambient), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.sev.long)),
diffex.sev.long[rowSums(is.na(diffex.sev.long)) != ncol(diffex.sev.long), ] %>%
as.data.frame() %>% rownames_to_column("gene") %>%
filter(gene %in% rownames(diffex.sev.long)),
by="gene")
c(outs.diffex.mod.short$gene,
outs.diffex.mod.short$gene,
outs.diffex.sev.short$gene,
outs.diffex.sev.long$gene) %>% unique() %>% length()
## [1] 592
Remove genes from DEG lists that contain iLOO-identified outliers from DEG objects, and save to file.
(diffex.mod.short.filt <- subset(diffex.mod.short, rownames(diffex.mod.short) %!in% c(outs.diffex.mod.short$gene)))
## log2 fold change (MLE): treatment ambient vs moderate_short
## Wald test p-value: treatment ambient vs moderate_short
## DataFrame with 990 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## gene_22086 163.4332 0.457994 0.141182 3.24400 0.001178650 0.0240594
## gene_17987 265.4243 0.833411 0.229862 3.62571 0.000288172 0.0134225
## gene_15688 114.7739 0.722213 0.208538 3.46321 0.000533762 0.0168004
## gene_24921 120.4863 -0.488948 0.160532 -3.04580 0.002320594 0.0331624
## gene_3171 67.1868 0.566857 0.167939 3.37538 0.000737148 0.0197493
## ... ... ... ... ... ... ...
## gene_3243 7.58741 1.18820 0.387205 3.06867 0.002150175 0.0316878
## gene_14299 7.73821 -1.77428 0.610481 -2.90637 0.003656498 0.0419260
## gene_2281 7.52911 1.24472 0.407544 3.05420 0.002256602 0.0325563
## gene_17639 7.45038 -2.47603 0.743645 -3.32958 0.000869762 0.0209776
## gene_40840 8.76357 1.01936 0.357289 2.85303 0.004330387 0.0452227
(diffex.mod.long.filt <- subset(diffex.mod.long, rownames(diffex.mod.long) %!in% c(outs.diffex.mod.long$gene)))
## log2 fold change (MLE): treatment ambient vs moderate_long
## Wald test p-value: treatment ambient vs moderate_long
## DataFrame with 1854 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## gene_24921 120.4863 -0.667364 0.1539545 -4.33481 1.45885e-05 0.000627676
## gene_41869 56.4835 -0.503796 0.1476097 -3.41303 6.42447e-04 0.008144084
## gene_21363 668.8843 -0.227074 0.0833102 -2.72564 6.41773e-03 0.037799117
## gene_23332 12.8703 0.999718 0.3279471 3.04841 2.30054e-03 0.018799339
## gene_23631 162.9875 -0.643769 0.2265273 -2.84191 4.48446e-03 0.029510837
## ... ... ... ... ... ... ...
## gene_38607 8.70204 0.989363 0.325078 3.04346 0.002338737 0.01902847
## gene_991 9.12787 1.563670 0.491829 3.17929 0.001476350 0.01395797
## gene_23381 8.75754 -1.014250 0.363173 -2.79275 0.005226275 0.03277801
## gene_31445 7.50957 1.053183 0.360920 2.91805 0.003522311 0.02483719
## gene_45082 9.34256 -2.147658 0.579019 -3.70913 0.000207971 0.00384174
(diffex.sev.short.filt <- subset(diffex.sev.short, rownames(diffex.sev.short) %!in% c(diffex.sev.short$gene)))
## log2 fold change (MLE): treatment ambient vs severe_short
## Wald test p-value: treatment ambient vs severe_short
## DataFrame with 47 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## gene_9733 919.971 0.388296 0.1069915 3.62922 2.84282e-04 0.043647033
## gene_17507 364.966 0.910460 0.1742690 5.22445 1.74671e-07 0.000342006
## gene_20451 794.996 -0.351901 0.0955198 -3.68406 2.29546e-04 0.040859221
## gene_25779 1048.888 1.408023 0.3499245 4.02379 5.72687e-05 0.021768821
## gene_30442 581.969 -0.500407 0.1449462 -3.45236 5.55703e-04 0.048565551
## ... ... ... ... ... ... ...
## gene_14493 46116.276 -0.406327 0.114958 -3.53459 0.000408415 0.0436470
## gene_22162 3319.819 0.484070 0.140261 3.45121 0.000558082 0.0485656
## gene_28677 1146.876 0.688146 0.193193 3.56196 0.000368090 0.0436470
## gene_18867 10148.797 -0.393208 0.109324 -3.59671 0.000322267 0.0436470
## gene_5901 857.875 -0.587951 0.167998 -3.49975 0.000465697 0.0455917
(diffex.sev.long.filt <- subset(diffex.sev.long, rownames(diffex.sev.long) %!in% c(diffex.sev.long$gene)))
## log2 fold change (MLE): treatment ambient vs severe_long
## Wald test p-value: treatment ambient vs severe_long
## DataFrame with 526 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## gene_38613 282.1055 0.362134 0.120267 3.01108 2.60324e-03 0.04735073
## gene_12130 48.2706 0.944692 0.237722 3.97393 7.06971e-05 0.00879515
## gene_19035 452.3958 -0.784800 0.168862 -4.64760 3.35822e-06 0.00131993
## gene_21303 179.8877 -0.406202 0.135245 -3.00346 2.66924e-03 0.04806765
## gene_28091 351.8465 -0.326202 0.108757 -2.99935 2.70557e-03 0.04839945
## ... ... ... ... ... ... ...
## gene_33802 186.7508 0.716785 0.230807 3.10556 1.89918e-03 0.04090794
## gene_20756 61.1330 1.023421 0.291222 3.51423 4.41025e-04 0.01981272
## gene_45528 432.3796 1.956336 0.491196 3.98280 6.81071e-05 0.00879515
## gene_11617 362.7768 0.476949 0.140959 3.38359 7.15442e-04 0.02453651
## gene_39972 24.4192 1.726614 0.523343 3.29920 9.69606e-04 0.02921745
save(diffex.mod.short.filt, file = "../results/deseq2/diffex.mod.short.filt")
save(diffex.mod.long.filt, file = "../results/deseq2/diffex.mod.long.filt")
save(diffex.sev.short.filt, file = "../results/deseq2/diffex.sev.short.filt")
save(diffex.sev.long.filt, file = "../results/deseq2/diffex.sev.long.filt")
Plot counts NOTE: using the DeSeq2 function plotCounts plots original counts, i.e. those that are normalized, but outlier values are NOT replaced with trimmed means. I’m really interested to see how the counts look that DESeq2 is analyzing, i.e. I want to see the outlier-replaced counts. So, I must include the argument replaced=TRUE in the plotCounts function.
It’s good to look at the gene expression value - if counts are low (e.g. less than 50 or so) that gene isn’t as interesting as one with mucn larger counts!
## # A tibble: 15 × 5
## gene_id padj spid evalue protein_names
## <chr> <dbl> <chr> <dbl> <chr>
## 1 gene_9105 0.00156 Q08603 7.12e-29 Geranylgeranyl transferase type-2 subunit…
## 2 gene_31640 0.00156 P62083 6.7 e-23 Small ribosomal subunit protein eS7 (40S …
## 3 gene_26230 0.00156 Q9XYN1 7.23e-27 Innexin inx2 (Innexin-2) (G-Inx2)
## 4 gene_1280 0.00212 Q962X9 2.69e-25 Protein BUD31 homolog (Protein G10 homolo…
## 5 gene_21431 0.00283 A7SMW7 5.36e-18 L-2-hydroxyglutarate dehydrogenase, mitoc…
## 6 gene_27663 0.00295 Q8C428 9.39e-18 Transmembrane channel-like protein 7
## 7 gene_19111 0.00295 Q9UL12 6.96e-66 Sarcosine dehydrogenase, mitochondrial (S…
## 8 gene_19034 0.00406 Q9ULK2 1.99e-13 Ataxin-7-like protein 1 (Ataxin-7-like pr…
## 9 gene_3111 0.00406 P21328 1.22e-13 RNA-directed DNA polymerase from mobile e…
## 10 gene_3494 0.00406 Q5ZM14 1.28e-16 Na(+)/H(+) exchange regulatory cofactor N…
## 11 gene_38520 0.00406 Q642P2 1.81e-18 Protein dopey-2
## 12 gene_29559 0.00433 Q8R420 3.03e-35 Phospholipid-transporting ATPase ABCA3 (E…
## 13 gene_39860 0.00433 Q9MAQ3 5.95e-20 Putative ubiquitin carboxyl-terminal hydr…
## 14 gene_31413 0.00468 P0CB99 1.87e-11 NADH dehydrogenase [ubiquinone] 1 alpha s…
## 15 gene_12849 0.00477 Q94793 6.66e-24 Large ribosomal subunit protein uL5 (60S …
## # A tibble: 15 × 5
## gene_id padj spid evalue protein_names
## <chr> <dbl> <chr> <dbl> <chr>
## 1 gene_30576 2.62e-27 Q4R6M5 2.49e- 20 Probable ATP-dependent RNA helicase DDX…
## 2 gene_8881 6.75e-16 P30414 7.58e- 15 NK-tumor recognition protein (NK-TR pro…
## 3 gene_13021 1.98e-14 Q5RC80 4.07e- 31 RNA-binding protein 39 (RNA-binding mot…
## 4 gene_11011 2.98e-13 Q52KI8 1.87e- 13 Serine/arginine repetitive matrix prote…
## 5 gene_28510 2.98e-13 Q02040 4.41e- 15 A-kinase anchor protein 17A (AKAP-17A) …
## 6 gene_44742 6.53e-12 Q92734 4.8 e- 13 Protein TFG (TRK-fused gene protein)
## 7 gene_14632 4.51e-10 Q9UKX7 4.87e- 24 Nuclear pore complex protein Nup50 (50 …
## 8 gene_19734 3.11e- 9 A7MB89 3.69e- 72 Protein fem-1 homolog C (FEM1c) (FEM1-g…
## 9 gene_17622 5.48e- 9 Q9I8F9 0 Heat shock 70 kDa protein 1 (HSP70-1)
## 10 gene_13542 1.61e- 8 Q9I8F9 0 Heat shock 70 kDa protein 1 (HSP70-1)
## 11 gene_28605 1.61e- 8 P53486 2.06e-140 Actin, cytoplasmic 3 (EC 3.6.4.-) (Beta…
## 12 gene_2928 9.78e- 8 A2VEY9 1.02e- 18 Palmitoyltransferase app (EC 2.3.1.225)…
## 13 gene_29217 1.73e- 7 Q02290 1.21e- 74 Endo-1,4-beta-xylanase B (Xylanase B) (…
## 14 gene_25041 2.32e- 7 Q71SY6 5.11e- 15 Out at first protein homolog (Protein V…
## 15 gene_33947 2.88e- 7 P52485 1.06e- 33 Ubiquitin-conjugating enzyme E2-24 kDa …
## # A tibble: 15 × 5
## gene_id padj spid evalue protein_names
## <chr> <dbl> <chr> <dbl> <chr>
## 1 gene_45978 0.0000930 Q02375 3.68e-12 NADH dehydrogenase [ubiquinone] iron-su…
## 2 gene_17507 0.000342 Q25434 6.64e-24 Adhesive plaque matrix protein (Foot pr…
## 3 gene_4166 0.00623 P31161 9.51e-42 Superoxide dismutase [Mn] 1, mitochondr…
## 4 gene_42525 0.00637 P19835 9.27e-12 Bile salt-activated lipase (BAL) (EC 3.…
## 5 gene_45077 0.00637 P24156 4.59e-12 Prohibitin 1 (Protein l(2)37Cc)
## 6 gene_38799 0.00637 O00871 2.22e-14 Phosphoglycerate kinase (EC 2.7.2.3)
## 7 gene_41481 0.00637 Q6PB44 3.91e-18 Tyrosine-protein phosphatase non-recept…
## 8 gene_16724 0.0218 Q96PE7 4.35e-25 Methylmalonyl-CoA epimerase, mitochondr…
## 9 gene_14971 0.0316 Q9VNX8 5.28e-14 Eukaryotic translation initiation facto…
## 10 gene_34598 0.0351 A4Z944 4.98e-21 Zinc finger BED domain-containing prote…
## 11 gene_32125 0.0372 Q96P09 1.82e-13 Baculoviral IAP repeat-containing prote…
## 12 gene_2973 0.0372 Q9VJ38 2.75e-21 Large ribosomal subunit protein uL13m (…
## 13 gene_37254 0.0372 Q1HPL8 3.38e-12 NADH dehydrogenase [ubiquinone] 1 beta …
## 14 gene_20451 0.0409 Q02375 3.68e-12 NADH dehydrogenase [ubiquinone] iron-su…
## 15 gene_40464 0.0409 Q5D891 4.67e-87 Phosphatidylinositol 3-kinase catalytic…
## # A tibble: 15 × 5
## gene_id padj spid evalue protein_names
## <chr> <dbl> <chr> <dbl> <chr>
## 1 gene_39883 0.00000883 Q8BTN6 1.51e-25 Leukocyte receptor cluster member 9
## 2 gene_11011 0.0000211 Q52KI8 1.87e-13 Serine/arginine repetitive matrix prot…
## 3 gene_14632 0.0000220 Q9UKX7 4.87e-24 Nuclear pore complex protein Nup50 (50…
## 4 gene_1130 0.0000256 Q5R685 9.65e-43 Phosphatidylinositol 3-kinase regulato…
## 5 gene_21525 0.0000805 G3X9Z4 2.73e-16 Pre-mRNA cleavage complex 2 protein Pc…
## 6 gene_13021 0.000264 Q5RC80 4.07e-31 RNA-binding protein 39 (RNA-binding mo…
## 7 gene_32373 0.000362 Q9USH8 1.64e-18 Glucosidase 2 subunit beta (Alpha-gluc…
## 8 gene_3253 0.000544 Q0KK59 6.26e-15 Protein unc-79 homolog
## 9 gene_46423 0.000833 F4ILR7 8.28e-21 DExH-box ATP-dependent RNA helicase DE…
## 10 gene_13289 0.000997 Q7K3M5 3.45e-65 ATP-dependent RNA helicase DHX15 homol…
## 11 gene_30576 0.00112 Q4R6M5 2.49e-20 Probable ATP-dependent RNA helicase DD…
## 12 gene_29874 0.00123 P35509 4.34e-56 Casein kinase I isoform gamma-3 (CKI-g…
## 13 gene_19035 0.00132 O14730 5.68e-17 Serine/threonine-protein kinase RIO3 (…
## 14 gene_21288 0.00149 Q9NDJ2 2.08e-59 Helicase domino (EC 3.6.4.-)
## 15 gene_2928 0.00155 A2VEY9 1.02e-18 Palmitoyltransferase app (EC 2.3.1.225…
# write simple function to return the name of an object as a string
# Here is a list containing all DEG sets - this will be used in subsequent code chunks, too!
degs <- list(
diffex.mod.short.filt,
diffex.mod.long.filt,
diffex.sev.short.filt,
diffex.sev.long.filt)
names(degs) <- c("Moderate OA - Short", "Moderate OA - Long",
"Severe OA - Short", "Severe OA - Long")
n.degs <- data.frame(matrix(NA, nrow = length(degs), ncol = 3))
names(n.degs) <- c("contrast", "n.degs", "perc")
for (i in (1:length(degs))) {
n.degs[i,1] <- names(degs[i])
n.degs[i,2] <- nrow(degs[[i]])
n.degs[i,3] <- 100*round(nrow(degs[[i]])/ncol(counts.ts), digits = 3)
}
n.degs$contrast <- factor(n.degs$contrast, levels=n.degs$contrast, ordered = T)
save(degs, file = "../results/deseq2/degs")
Barplot of the number of DEGs
# single stressors vs. multiple stressors
#pdf(file="../results/deseq2/DEGs_barplot_single-vs-double-stressor.pdf", height = 2, width = 7)
ggplot(n.degs,
# %>%
# filter(contrast %in% c("Amb vs. Low pH @6C",
# "3 vs. 6C @Amb pH", "6 amb vs. 3C low",
# "6 vs. 10C @Amb pH", "6 amb vs. 10 low")) %>%
# mutate(effect_of=case_when(
# grepl("Amb vs. Low pH @6C", contrast) ~ "High pCO2",
# grepl("3 vs. 6C", contrast) ~ "Cold temperature",
# grepl("6 amb vs. 3C low", contrast) ~ "Cold temperature + High pCO2",
# grepl("6 amb vs. 10 low", contrast) ~ "Warm temperature + High pCO2",
# grepl("6 vs. 10C", contrast) ~ "Warm temperature")) %>%
# mutate(effect_of=fct_relevel(effect_of,
# c("Cold temperature + High pCO2", "Cold temperature",
# "Warm temperature + High pCO2", "Warm temperature",
# "High pCO2"))),
aes(x=contrast, y=n.degs, fill=contrast)) +
geom_bar(alpha=0.8, stat = "identity", color="gray20", size=0.1) +
geom_text(hjust=-0.1, size=3.5, aes(label=paste(comma(n.degs), " (", perc, "%)", sep=""))) +
xlab("Contrast") + ylab("Number Differentially Expressed Genes (% of all genes)") +
xlab(NULL) + ylab(NULL) + coord_flip() + ggtitle("Number Differentially Expressed Genes (% of all genes)") +
theme_minimal() + ylim(c(0,2150)) +
theme(legend.position = "none", plot.title = element_text(size = 10)) +
scale_fill_manual(name="Treatment",
values=c("Moderate OA - Short"="darkgreen",
"Moderate OA - Long"="yellow3",
"Severe OA - Short"="sienna2",
"Severe OA - Long"="firebrick4"),
labels=c("Moderate OA - Short"="pH 7.8, Short Exposure",
"Moderate OA - Long"="pH 7.8, Long Exposure",
"Severe OA - Short"="pH 7.5, Short Exposure",
"Severe OA - Long"="pH 7.5, Long Exposure")) +
guides(fill=guide_legend(nrow=3,byrow=TRUE,reverse=TRUE))
#dev.off()
Regenerate barplot for paper (Figure 3) with the number of upregulated / downregualted DEGs by treatment
# summary(res.mod.short)
# summary(res.mod.long)
# summary(res.sev.short)
# summary(res.sev.long)
degs.updown <- list(
diffex.mod.short.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.mod.short.filt %>% data.frame() %>% filter(log2FoldChange < 0),
diffex.mod.long.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.mod.long.filt %>% data.frame() %>% filter(log2FoldChange < 0),
diffex.sev.short.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.sev.short.filt %>% data.frame() %>% filter(log2FoldChange < 0),
diffex.sev.long.filt %>% data.frame() %>% filter(log2FoldChange > 0),
diffex.sev.long.filt %>% data.frame() %>% filter(log2FoldChange < 0))
names(degs.updown) <- c("Moderate OA - Short", "Moderate OA - Short",
"Moderate OA - Long", "Moderate OA - Long",
"Severe OA - Short", "Severe OA - Short",
"Severe OA - Long", "Severe OA - Long")
n.degs.updown <- data.frame(matrix(NA, nrow = length(degs.updown), ncol = 3))
names(n.degs.updown) <- c("stressor", "n.degs", "perc")
for (i in (1:length(degs.updown))) {
n.degs.updown[i,1] <- names(degs.updown[i])
n.degs.updown[i,2] <- nrow(degs.updown[[i]])
n.degs.updown[i,3] <- 100*round(nrow(degs.updown[[i]])/ncol(counts.ts), digits = 2)
}
n.degs.updown <- n.degs.updown %>%
mutate(stressor=factor(stressor, levels=
c("Moderate OA - Short", "Moderate OA - Long",
"Severe OA - Short", "Severe OA - Long")),
response=rep(c("up", "down"), times=4))
# Barplot with no. of DEGs by single stressors vs. multiple stressors, including up/down regulation
#pdf(file="../results/deseq2/DEGs_barplot_single-vs-double-stressor-up-down.pdf", height = 2, width = 7)
ggplot(n.degs.updown,
aes(x=stressor, y=n.degs, fill=stressor)) +
geom_bar_pattern(aes(pattern=response), alpha=0.75, stat = "identity", color="gray20", size=0.05,
pattern_fill="black", pattern_alpha=0.2, pattern_density=0.01, pattern_spacing=0.025) +
xlab("Contrast") + ylab("Number Differentially Expressed Genes (% of all genes)") +
xlab(NULL) + ylab(NULL) + coord_flip() + ggtitle("Number Differentially Expressed Genes (% of all genes)") +
theme_pubr() + scale_y_continuous(label=comma, limits = c(0,2000)) +
theme(legend.position = "none", plot.title = element_text(size = 10)) +
scale_fill_manual(name="Treatment",
values=c("Moderate OA - Short"="darkgreen",
"Moderate OA - Long"="yellow3",
"Severe OA - Short"="sienna2",
"Severe OA - Long"="firebrick4"),
labels=c("Moderate OA - Short"="pH 7.8, Short Exposure",
"Moderate OA - Long"="pH 7.8, Long Exposure",
"Severe OA - Short"="pH 7.5, Short Exposure",
"Severe OA - Long"="pH 7.5, Long Exposure")) +
scale_pattern_manual(values = c("stripe","none")) +
guides(fill=guide_legend(nrow=3,byrow=TRUE,reverse=TRUE))
#dev.off()
# Generate counts matrix With DEGs among any treatment
diffex.counts <- subset(assay(vsd.treatment), rownames(dds.DESeq.treatment) %in% unique(c(rownames(diffex.mod.short.filt),rownames(diffex.mod.long.filt),
rownames(diffex.sev.short.filt), rownames(diffex.sev.long.filt))))
# Create a dataframe to annotate heatmap with treatments
dds.df <- sample.info[match(colnames(diffex.counts),
sample.info$sampleID),c("sampleID", "treatment")] %>%
remove_rownames() %>% column_to_rownames(var = "sampleID")
# Make sure treatment order is correct
all(colnames(diffex.counts) == rownames(dds.df)) #double check that samples are still in same order
## [1] TRUE
# All DEGs among any pH treatment (within temperatures)
pheatmap(diffex.counts, cluster_rows=TRUE, cluster_cols=FALSE,
show_rownames=FALSE, na.rm=TRUE, annotation_colors = list(treatment=
c("ambient"= "navyblue", "moderate_short"="darkgreen",
"moderate_long"="yellow3","severe_short"="sienna2",
"severe_long"="firebrick4")),
scale="row", main = "Differentially expressed genes (DEGs) among any pH treatment",
annotation_col=dds.df[,"treatment", drop=FALSE], fontsize = 8,
#cutree_rows = 2,
border_color=F)
It is a little clunky, but the DAVID functional analysis tool is one of the best ways to perform enrichment analysis (that I have found to date). To use, you copy a list of Uniprot IDs (or other gene identifier) to clipboard, then paste it into their tool. You then do the same for all genes in the analysis, providing an accurate background list of genes. Here is code to copy all genes, then upregulated and downregulated DEGs identified by each contrast. For background on what an enrichment analysis is, check out this great video.
DEGs.4David <- list(
"moderate_short" =
enrich.mod.short %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"moderate_long" =
enrich.mod.long %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"severe_short" =
enrich.sev.short %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"severe_long" =
enrich.sev.long %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector()
)
# Write tab-delimited file with each column containing Uniprot IDs for each set of DEGs
write_delim(x = plyr::ldply(DEGs.4David, rbind) %>% t() %>% as.data.frame(), delim = "\t", col_names = F, na="",
file = "../results/deseq2/DEGs-4David.txt")
# BACKGROUND - all genes submitted to DESeq2 analysis
enrich.background %>% write_clip(allow_non_interactive = TRUE)
NOTE: outside of R I created a .tab file with the first column containing file names for enrichment results, and second column containing the contrast + which treatment was upregulated (for grouping in R). Example: the following row refers to the file contains enrichment results for genes upregulated in 10C in the contrast 6C vs. 10C at Low pH
#### Read in enriched biological functions for all contrasts
filenames <- read_delim(file="../results/deseq2/GO_filenames.txt", delim = "\t", col_names = c("filename", "gene_set"))
file_list <- vector(mode = "list", length = nrow(filenames))
names(file_list) <- c(filenames$gene_set)
for (i in 1:nrow(filenames)) {
file_list[[i]] <- data.frame(read_delim(file=paste("../results/deseq2/", filenames[i,"filename"], sep=""), delim = "\t"))}
david <- bind_rows(file_list, .id = "gene_set") %>% clean_names() %>% # Bind dataframe for each gene_set together
#filter(category=="GOTERM_BP_DIRECT") %>%
filter(p_value<0.05) %>%
dplyr::rename(percent=x) %>%
separate(term, into = c("term", "process"), sep = "~") %>%
separate(gene_set, sep = "_", into = c("OA", "duration"), remove = F) %>%
mutate(treatment=factor(gene_set, ordered = TRUE, levels=c("moderate_short","moderate_long", "severe_short", "severe_long")),
OA=as.factor(OA), duration=as.factor(duration)) %>%
tidyr::complete(OA, duration, category) # add rows for gene_sets that are missing from dataframe
save(david, file="../results/deseq2/david")
# Uniprot keywords
david %>%
filter(p_value<0.05) %>%
filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%
arrange(OA, duration, p_value) %>%
mutate_at(c("p_value", "fdr"), funs(signif(.,2))) %>%
dplyr::select(treatment, OA, duration, term, process, count, p_value, fdr, genes) %>%
write_clip(allow_non_interactive = TRUE)
# Gene Ontology terms
david %>%
filter(p_value<0.05) %>%
filter(category=="GOTERM_BP_DIRECT") %>%
arrange(OA, duration, p_value) %>%
mutate_at(c("p_value", "fdr"), funs(signif(.,2))) %>%
dplyr::select(treatment, OA, duration, term, process, count, p_value, fdr, genes) %>%
write_clip(allow_non_interactive = TRUE)
#pdf(file="../results/deseq2/KW-bubble.pdf", height = 3.05, width = 4)
david %>% #View()
#filter(count>=3) %>% # use this to filter for processes that have at minimum n genes
filter(p_value<0.05) %>% #filter enrichment results for p-value
# Option 1: Use this line for gene ontology biological processes
filter(category=="GOTERM_BP_DIRECT") %>%
# Option 2: Use this line for Uniprot Keywords
#filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%
# Can filter for other options in the "category" column, but might need to adjust the y= in the ggplot(aes).
# For example would need to use y=term for KEGG_PATHWAY
group_by(treatment, genes, count) %>%
dplyr::summarise(p_value=min(p_value)) %>% distinct() %>% ungroup() %>% #for rows with duplicate grouping variabes, select one with lowest p-value
left_join(david %>%
dplyr::select(treatment, category, term, process, count, percent, p_value, fold_enrichment, fdr, genes),
by = c("treatment", "genes", "p_value", "count")) %>% #re-add data
# ungroup() %>% arrange(stress, response, p_value) %>% group_by(stress, response) %>% dplyr::slice(1:15) %>%
# filter(response=="Upregulated") %>%
ungroup() %>% arrange(treatment, p_value) %>%
mutate(process = factor(process, rev(unique(process)), ordered = TRUE)) %>%
ggplot(aes(y = process, x=treatment, col=treatment)) +
geom_point(alpha=0.65, aes(size=-log10(p_value))) + #size=count, alpha=p_value,
facet_wrap(~category,scales="free", nrow = 2) +
scale_color_manual(name="Treatment",
values=c("moderate_short"=lighten("darkgreen", 0.25),
"moderate_long"=lighten("yellow3", 0.25),
"severe_short"=lighten("sienna2"),
"severe_long"=lighten("firebrick4")),
labels=c("moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure"),
guide = guide_legend(override.aes = list(size=4))) +
scale_size("-Log10 P-value", range = c(2,8), #breaks = c(2, 5, 10),
guide = guide_legend(override.aes = list(col="gray50"))) +
scale_x_discrete(drop=T) + #Do drop empty factors for temperature contrasts
theme_cleveland() +
theme(
legend.position = "right",
axis.text.x=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
axis.text.y=element_text(size=7.25),
plot.title = element_text(size=8),
legend.text=element_text(size=6),
legend.title = element_text(size=6),
strip.background = element_blank(),
strip.text.x = element_blank()) +
ggtitle("Enriched Biological Processes")
#dev.off()
First, determine which log-fold change (LFC) direction (>0 or <0) refers to higher/lower expression for treatments in each contrast
test <- ((
# enrich.mod.short %>%
# enrich.mod.long %>%
# enrich.sev.short %>%
enrich.sev.long %>%
# filter(select(., contains("log2FoldChange"))>.5))$gene_id)
filter(select(., contains("log2FoldChange"))<(-0.5)))$gene_id)
# Here are treatments
# "ambient" "moderate_long" "moderate_short" "severe_long" "severe_short"
a <- c("ambient", "moderate_short")
c <- data.frame(gene=character(), treatment=character(), mean=numeric())
d <- list()
for (i in 1:length(test)) {
b <- plotCounts(dds.DESeq.treatment, gene=test[i], intgroup=c("treatment"), replaced = TRUE, returnData = TRUE) %>%
rownames_to_column("sampleID") %>%
left_join(sample.info[,c("sampleID", "treatment")]) %>%
filter(treatment %in% a)
c[1:2,1] <- test[i]
c[1:2,2:3] <- summarySE(measurevar = "count", groupvars = "treatment", data = b)[,c("treatment", "count")] %>%
mutate(treatment=as.character(treatment))
d[[i]] <- c
}
bind_rows(d, .id = "i") %>%
mutate(treat.num=factor(treatment, levels = a, ordered = T),
gene=as.factor(gene)) %>%
ggplot(aes(x=treatment, y=log(mean, base = 2), group=gene)) + geom_line(size=.25, color="gray50") +
theme_minimal() +
#scale_x_continuous(breaks=c(1,2), labels=a) +
theme(axis.title.x = element_blank(), plot.title = element_text(size=10), axis.text.x = element_text(hjust = 0.2))
# RESULTS
# enrich.mod.short - LFC > 0.5 - LOWER IN OA
# enrich.mod.short - LFC < (-0.5) - HIGHER IN OA
# enrich.mod.long - LFC > 0.5 - LOWER IN OA
# enrich.mod.long - LFC < (-0.5) - HIGHER IN OA
# enrich.sev.short - LFC > 0.5 - LOWER IN OA
# enrich.sev.short - LFC < (-0.5) - HIGHER IN OA
# enrich.sev.long - LFC > 0.5 - LOWER IN OA
# enrich.sev.long - LFC < (-0.5) - HIGHER IN OA
This file lists DEGs that are upregulated/downregulated in response to each treatment for DAVID enrichment analysis
# RESULTS
DEGs.updown.4David <- list(
"moderate_short_downregulated" =
# enrich.mod.short - LFC > 0.5 - LOWER IN OA
enrich.mod.short %>%
filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"moderate_short_upregulated" =
# enrich.mod.short - LFC < (-0.5) - HIGHER IN OA
enrich.mod.short %>%
filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"moderate_long_downregulated" =
# enrich.mod.long - LFC > 0.5 - LOWER IN OA
enrich.mod.long %>%
filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"moderate_long_upregulated" =
# enrich.mod.long - LFC < (-0.5) - HIGHER IN OA
enrich.mod.long %>%
filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"severe_short_downregulated" =
# enrich.sev.short - LFC > 0.5 - LOWER IN OA
enrich.sev.short %>%
filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"severe_short_upregulated" =
# enrich.sev.short - LFC < (-0.5) - HIGHER IN OA
enrich.sev.short %>%
filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"severe_long_downregulated" =
# enrich.sev.long - LFC > 0.5 - LOWER IN OA
enrich.sev.long %>%
filter(dplyr::select(., contains("log2FoldChange"))>0.5) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector(),
"severe_long_upregulated" =
# enrich.sev.long - LFC < (-0.5) - HIGHER IN OA
enrich.sev.long %>%
filter(dplyr::select(., contains("log2FoldChange"))<(-0.5)) %>%
dplyr::select(spid) %>% na.omit() %>% unlist() %>% as.vector()
)
# Write tab-delimited file with each column containing Uniprot IDs for each set of DEGs
write_delim(x = ldply(DEGs.updown.4David, rbind) %>% t() %>% as.data.frame(), delim = "\t", col_names = F, na="",
file = "../results/deseq2/DEGs-updown-4David.txt")
# Copy the BACKGROUND - all genes submitted to DESeq2 analysis
enrich.background %>% write_clip(allow_non_interactive = TRUE)
filenames_updown <- read_delim(file="../results/deseq2/GO_filenames_updown.txt", delim = "\t", col_names = c("filename", "gene_set"))
file_list_updown <- vector(mode = "list", length = nrow(filenames_updown))
names(file_list_updown) <- c(filenames_updown$gene_set)
for (i in 1:nrow(filenames_updown)) {
file_list_updown[[i]] <- data.frame(read_delim(file=paste("../results/deseq2/", filenames_updown[i,"filename"], sep=""), delim = "\t"))}
david_updown <- bind_rows(file_list_updown, .id = "gene_set") %>% clean_names() %>% # Bind dataframe for each gene_set together
#filter(category=="GOTERM_BP_DIRECT") %>%
filter(p_value<0.05) %>%
dplyr::rename(percent=x) %>%
separate(term, into = c("term", "process"), sep = "~") %>%
separate(gene_set, sep = "_", into = c("OA", "duration", "response"), remove = F) %>%
mutate(treatment= gsub("_upregulated|_downregulated", "", gene_set) %>% factor(., ordered = TRUE, levels=c("moderate_short","moderate_long", "severe_short", "severe_long")),
OA=as.factor(OA), duration=as.factor(duration), response=as.factor(response)) %>%
tidyr::complete(OA, duration, response, category) # add rows for gene_sets that are missing from dataframe
save(david_updown, file="../results/deseq2/david_updown")
# Bubble plot, Upregulated processes
#pdf(file="../results/GO-enrichment/Final-DAVID/Upregulated-GO-bubble.pdf", height = 9, width = 7.5)
david_updown %>%
# filter(count>=3) %>%
filter(p_value<0.05) %>% filter(count>=3) %>%
filter(category=="GOTERM_BP_DIRECT") %>%
# filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%
group_by(treatment, response, genes, count) %>%
dplyr::summarise(p_value=min(p_value)) %>% distinct() %>% ungroup() %>% #for rows with duplicate grouping variabes, select one with lowest p-value
left_join(david_updown %>%
dplyr::select(treatment, response, category, term, process, count, percent, p_value, fold_enrichment, fdr, genes),
by = c("treatment", "response", "genes", "p_value", "count")) %>% #re-add data
# ungroup() %>% arrange(stress, response, p_value) %>% group_by(stress, response) %>% dplyr::slice(1:15) %>%
filter(response=="upregulated") %>%
ungroup() %>% arrange(treatment, p_value) %>%
mutate(process = factor(process, rev(unique(process)), ordered = TRUE)) %>%
ggplot(aes(y = process, x=treatment, col=treatment)) +
geom_point(alpha=0.65, aes(size=-log10(p_value))) + #size=count, alpha=p_value,
facet_wrap(~category,scales="free", nrow = 2) +
scale_color_manual(name="Treatment",
values=c("moderate_short"=lighten("darkgreen", 0.25),
"moderate_long"=lighten("yellow3", 0.25),
"severe_short"=lighten("sienna2"),
"severe_long"=lighten("firebrick4")),
labels=c("moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure"),
guide = guide_legend(override.aes = list(size=4))) +
scale_size("-Log10 P-value", range = c(2,8), #breaks = c(2, 5, 10),
guide = guide_legend(override.aes = list(col="gray50"))) +
scale_x_discrete(drop=T) + #Do drop empty factors for temperature contrasts
theme_cleveland() +
theme(
legend.position = "right",
axis.text.x=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
axis.text.y=element_text(size=7.25),
plot.title = element_text(size=8),
legend.text=element_text(size=6),
legend.title = element_text(size=6),
strip.background = element_blank(),
strip.text.x = element_blank()) +
ggtitle("Enriched Biological Processes\nin UPREGULATED genes")
Enriched terms in DOWNREGULATED genes
# Bubble plot, Upregulated processes
#pdf(file="../results/GO-enrichment/Final-DAVID/Upregulated-GO-bubble.pdf", height = 9, width = 7.5)
david_updown %>%
# filter(count>=3) %>%
filter(p_value<0.05) %>% filter(count>=3) %>%
filter(category=="GOTERM_BP_DIRECT") %>%
#filter(category=="UP_KW_BIOLOGICAL_PROCESS") %>%
group_by(treatment, response, genes, count) %>%
dplyr::summarise(p_value=min(p_value)) %>% distinct() %>% ungroup() %>% #for rows with duplicate grouping variabes, select one with lowest p-value
left_join(david_updown %>%
dplyr::select(treatment, response, category, term, process, count, percent, p_value, fold_enrichment, fdr, genes),
by = c("treatment", "response", "genes", "p_value", "count")) %>% #re-add data
# ungroup() %>% arrange(stress, response, p_value) %>% group_by(stress, response) %>% dplyr::slice(1:15) %>%
filter(response=="downregulated") %>%
ungroup() %>% arrange(treatment, p_value) %>%
mutate(process = factor(process, rev(unique(process)), ordered = TRUE)) %>%
ggplot(aes(y = process, x=treatment, col=treatment)) +
geom_point(alpha=0.65, aes(size=-log10(p_value))) + #size=count, alpha=p_value,
facet_wrap(~category,scales="free", nrow = 2) +
scale_color_manual(name="Treatment",
values=c("moderate_short"=lighten("darkgreen", 0.25),
"moderate_long"=lighten("yellow3", 0.25),
"severe_short"=lighten("sienna2"),
"severe_long"=lighten("firebrick4")),
labels=c("moderate_short"="pH 7.8, Short Exposure",
"moderate_long"="pH 7.8, Long Exposure",
"severe_short"="pH 7.5, Short Exposure",
"severe_long"="pH 7.5, Long Exposure"),
guide = guide_legend(override.aes = list(size=4))) +
scale_size("-Log10 P-value", range = c(2,8), #breaks = c(2, 5, 10),
guide = guide_legend(override.aes = list(col="gray50"))) +
scale_x_discrete(drop=T) + #Do drop empty factors for temperature contrasts
theme_cleveland() +
theme(
legend.position = "right",
axis.text.x=element_blank(),
axis.title.x=element_blank(),
axis.ticks.x = element_blank(),
axis.text.y=element_text(size=7.25),
plot.title = element_text(size=8),
legend.text=element_text(size=6),
legend.title = element_text(size=6),
strip.background = element_blank(),
strip.text.x = element_blank()) +
ggtitle("Enriched Biological Processes\nin DOWNREGULATED genes")